In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
from matplotlib import pyplot as plt
%matplotlib inline
#html export
import plotly.io as pio
pio.renderers.default = 'notebook'
In [2]:
df = pd.read_excel('TRAFFIC ACCIDENTS DATA.xlsx')
df
Out[2]:
| Date | Accident Spot | Area | County | Road/ Highway | Brief Accident Details/Cause | Victims | Total people confirmed dead | Time of the Accidents | Weather conditions | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2023-08-08 | Sobea | Sobea | Nakuru | Nakuru-Eldoret Highway | Head on Collision | Passengers | 4 | 4.30 pm | NaN |
| 1 | 2023-08-07 | Maai-Mahiu | Naivasha | Nakuru | Maai-Mahiu Naivasha Highway | vehicle and motorcycle collision | Passengers | 1 | 5.50 pm | NaN |
| 2 | 2023-07-25 | Ntulele | Ntulele | Narok | Narok Mai Mahiu road | Head on Collision | Drivers/Occupants | 4 | NaN | NaN |
| 3 | 2022-12-02 | Suswa | Suswa | Narok | Narok Mai Mahiu road | Head on Collision | Driver and passengers | 3 | 6.00 pm | NaN |
| 4 | 2022-12-01 | Mutira | Mutira | Kirinyaga | Kerugoya-Karatina Road | Run over | Pedestrian | 1 | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 76 | 2022-06-03 | Losengeli | Sabatia | Vihiga | Nairobi Kakamega road | Bus lost control and rolled over | Passengers | 1 | NaN | NaN |
| 77 | 2022-05-31 | Isinya | Isinya | Kajiado | Namanga Road | Car rammed into a trailer | Passengers | 1 | 11.00 pm | NaN |
| 78 | 2022-05-30 | Kinugi | Kinugi | Nakuru | Naivasha Nairobi Highway | Truck driver collided with an oncoming matatu | Passengers | 1 | NaN | NaN |
| 79 | 2022-02-28 | Kinugi | Kinugi | Nakuru | Nakuru - Nairobi highway | Car rammed into a lorry | Passengers | 4 | NaN | NaN |
| 80 | 2023-08-08 | Sobea | Sobea | Nakuru | Nakuru Eldoret Highway | matatu collided with a truck | Passengers | 2 | 4.30 am | NaN |
81 rows × 10 columns
Columns and Description¶
- Date - Contains dates of the accidents
- Accident Spot - Describes the specific location of the accident
- Area - General area where the accident occurred.
- County - County in which the accident happened.
- Road/ Highway - Road or highway where the accident occurred.
- Brief Accident Details/Cause - Short description of the accident cause or details.
- Victims - Specifies the type of victims involved (e.g., passengers, drivers).
- Total people confirmed dead - Indicates the number of fatalities.
- Time of the Accidents - The time when the accidents occurred.
- Weather conditions - Details about weather conditions during the accidents.
Preparing the dataset¶
In [3]:
df.columns
Out[3]:
Index(['Date', 'Accident Spot', 'Area', 'County', 'Road/ Highway',
'Brief Accident Details/Cause', 'Victims',
'Total people confirmed dead', 'Time of the Accidents',
'Weather conditions'],
dtype='object')
In [4]:
# make column names and values uniform
df.rename(columns={'Road/ Highway': 'Road/Highway'}, inplace=True) # renaming a column
df[ 'Total people confirmed dead'] = df[ 'Total people confirmed dead'].apply(pd.to_numeric, errors='coerce')
df.columns = df.columns.str.lower().str.replace(' ', '_')
categorical_columns = df.dtypes[df.dtypes == 'object'].index
for c in categorical_columns:
df[c] = df[c].str.lower().str.replace(' ', '_')
df
Out[4]:
| date | accident_spot | area | county | road/highway | brief_accident_details/cause | victims | total_people_confirmed_dead | time_of_the_accidents | weather_conditions | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2023-08-08 | sobea | sobea | nakuru | nakuru-eldoret_highway | head_on_collision | passengers | 4.0 | 4.30_pm | NaN |
| 1 | 2023-08-07 | maai-mahiu | naivasha | nakuru | maai-mahiu_naivasha_highway | vehicle_and_motorcycle_collision | passengers | 1.0 | 5.50_pm | NaN |
| 2 | 2023-07-25 | ntulele | ntulele | narok | narok_mai_mahiu_road | head_on_collision | drivers/occupants | 4.0 | NaN | NaN |
| 3 | 2022-12-02 | suswa | suswa | narok | narok_mai_mahiu_road | head_on_collision | driver_and_passengers | 3.0 | 6.00_pm | NaN |
| 4 | 2022-12-01 | mutira | mutira | kirinyaga | kerugoya-karatina_road | run_over | pedestrian | 1.0 | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 76 | 2022-06-03 | losengeli | sabatia | vihiga | nairobi_kakamega_road | bus_lost_control_and_rolled_over | passengers | 1.0 | NaN | NaN |
| 77 | 2022-05-31 | isinya | isinya | kajiado | namanga_road | car_rammed_into_a_trailer | passengers | 1.0 | 11.00_pm | NaN |
| 78 | 2022-05-30 | kinugi | kinugi | nakuru | naivasha_nairobi_highway | truck_driver_collided_with_an_oncoming_matatu | passengers | 1.0 | NaN | NaN |
| 79 | 2022-02-28 | kinugi | kinugi | nakuru | nakuru_-_nairobi_highway_ | car_rammed_into_a_lorry | passengers | 4.0 | NaN | NaN |
| 80 | 2023-08-08 | sobea | sobea | nakuru | nakuru_eldoret_highway | matatu_collided_with_a_truck | passengers | 2.0 | 4.30_am | NaN |
81 rows × 10 columns
In [5]:
df.dtypes
Out[5]:
date datetime64[ns] accident_spot object area object county object road/highway object brief_accident_details/cause object victims object total_people_confirmed_dead float64 time_of_the_accidents object weather_conditions object dtype: object
In [6]:
# converting specified columns
# Convert to integer
df['total_people_confirmed_dead'] = df['total_people_confirmed_dead'].astype('Int64')
# Convert to 24-hour time format
# Step 1: Replace time strings to make them compatible
df['time_of_the_accidents'] = df['time_of_the_accidents'].replace(
r'(\d{1,2})\.(\d{1,2})_?([ap]m)', # Allow missing underscore
r'\1:\2 \3',
regex=True
)
# Step 2: Convert to datetime, handling AM/PM
df['time_of_the_accidents'] = pd.to_datetime(
df['time_of_the_accidents'], format='%I:%M %p', errors='coerce'
)
# Step 3: Extract only the time in HH:MM format
df['time_of_the_accidents'] = df['time_of_the_accidents'].dt.strftime('%H:%M')
df
Out[6]:
| date | accident_spot | area | county | road/highway | brief_accident_details/cause | victims | total_people_confirmed_dead | time_of_the_accidents | weather_conditions | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2023-08-08 | sobea | sobea | nakuru | nakuru-eldoret_highway | head_on_collision | passengers | 4 | 16:30 | NaN |
| 1 | 2023-08-07 | maai-mahiu | naivasha | nakuru | maai-mahiu_naivasha_highway | vehicle_and_motorcycle_collision | passengers | 1 | 17:50 | NaN |
| 2 | 2023-07-25 | ntulele | ntulele | narok | narok_mai_mahiu_road | head_on_collision | drivers/occupants | 4 | NaN | NaN |
| 3 | 2022-12-02 | suswa | suswa | narok | narok_mai_mahiu_road | head_on_collision | driver_and_passengers | 3 | 18:00 | NaN |
| 4 | 2022-12-01 | mutira | mutira | kirinyaga | kerugoya-karatina_road | run_over | pedestrian | 1 | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 76 | 2022-06-03 | losengeli | sabatia | vihiga | nairobi_kakamega_road | bus_lost_control_and_rolled_over | passengers | 1 | NaN | NaN |
| 77 | 2022-05-31 | isinya | isinya | kajiado | namanga_road | car_rammed_into_a_trailer | passengers | 1 | 23:00 | NaN |
| 78 | 2022-05-30 | kinugi | kinugi | nakuru | naivasha_nairobi_highway | truck_driver_collided_with_an_oncoming_matatu | passengers | 1 | NaN | NaN |
| 79 | 2022-02-28 | kinugi | kinugi | nakuru | nakuru_-_nairobi_highway_ | car_rammed_into_a_lorry | passengers | 4 | NaN | NaN |
| 80 | 2023-08-08 | sobea | sobea | nakuru | nakuru_eldoret_highway | matatu_collided_with_a_truck | passengers | 2 | 04:30 | NaN |
81 rows × 10 columns
In [7]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 81 entries, 0 to 80 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 date 80 non-null datetime64[ns] 1 accident_spot 81 non-null object 2 area 81 non-null object 3 county 81 non-null object 4 road/highway 80 non-null object 5 brief_accident_details/cause 79 non-null object 6 victims 63 non-null object 7 total_people_confirmed_dead 71 non-null Int64 8 time_of_the_accidents 25 non-null object 9 weather_conditions 1 non-null object dtypes: Int64(1), datetime64[ns](1), object(8) memory usage: 6.5+ KB
Data Cleaning¶
In [8]:
# duplicates
df.duplicated().sum()
Out[8]:
1
In [9]:
#dropping duplicates
df = df.drop_duplicates()
In [10]:
df.duplicated().sum()
Out[10]:
0
In [11]:
# nulls
df.isna().sum()
Out[11]:
date 1 accident_spot 0 area 0 county 0 road/highway 1 brief_accident_details/cause 2 victims 17 total_people_confirmed_dead 9 time_of_the_accidents 55 weather_conditions 79 dtype: int64
In [12]:
(df['weather_conditions'].isna().sum()/len(df)) * 100
Out[12]:
98.75
Approximately 99% of the 'weather_conditions' column is null hence safe to drop this column
In [13]:
# dropping 'weather_conditions' column
del df['weather_conditions']
In [14]:
# date
df[df['date'].isna()]
Out[14]:
| date | accident_spot | area | county | road/highway | brief_accident_details/cause | victims | total_people_confirmed_dead | time_of_the_accidents | |
|---|---|---|---|---|---|---|---|---|---|
| 66 | NaT | ndarugo | ndarugo | kiambu | thika_road_ | truck_lost_control_and_overturned | NaN | 0 | 10:00 |
In [15]:
df.iloc[60:70]
Out[15]:
| date | accident_spot | area | county | road/highway | brief_accident_details/cause | victims | total_people_confirmed_dead | time_of_the_accidents | |
|---|---|---|---|---|---|---|---|---|---|
| 61 | 2023-04-01 | amabuko | kroka_town | kisii | kisii_keroka_road | matatu_collided_with_a_trailer | passengers | 5 | NaN |
| 62 | 2023-03-30 | kayole_bridge | nakuru | nakuru | nakuru_-_nairobi_highway_ | bus_collided_with_a_matatu | passengers | 18 | NaN |
| 63 | 2023-03-31 | burguret_shopping_centre | burguret | muranga | naromoru_nanyuki_road | bus_overturned | passengers | 0 | NaN |
| 64 | 2023-04-18 | maragua | maragua | muranga | kenol_muranga_road | matatu_collided_with_a_lorry | passengers | 2 | NaN |
| 65 | 2023-08-08 | migori_town | migori_town | migori | migori_isibania_road | truck_failed_brakes_and_ran_into_ither_vehicles | pedestrians | 8 | 07:30 |
| 66 | NaT | ndarugo | ndarugo | kiambu | thika_road_ | truck_lost_control_and_overturned | NaN | 0 | 10:00 |
| 67 | 2023-02-04 | kakwamunyen | kakuma | turkana | lodwar_kakuma_road | driver_swerved_to_avoid_on_hitting_a_camel_on_... | passengers | 14 | 22:00 |
| 68 | 2023-01-29 | mamboleo | mamboleo | kisumu | kakamega_kisumu_road | bus_veered_off_the_road_and_landed_in_a_ditch | passengers | 0 | NaN |
| 69 | 2023-01-26 | kikopey | kikopey | nakuru | nakuru_-_nairobi_highway_ | a_matatu_rammed_into_a_lorry | passengers | 7 | NaN |
| 70 | 2023-09-08 | iibisil_towm | iibisil | kajiado | namanga_road | sallon_car_rammed_iinto_a_lorry | passengers | 3 | NaN |
In [16]:
# the sorrounding data for the null date is around the same period
# Use .loc to fill the 'date' column forward
df.loc[:, 'date'] = df['date'].ffill()
In [17]:
df.isna().sum()
Out[17]:
date 0 accident_spot 0 area 0 county 0 road/highway 1 brief_accident_details/cause 2 victims 17 total_people_confirmed_dead 9 time_of_the_accidents 55 dtype: int64
In [18]:
# road/highway
df[df['road/highway'].isna()]
Out[18]:
| date | accident_spot | area | county | road/highway | brief_accident_details/cause | victims | total_people_confirmed_dead | time_of_the_accidents | |
|---|---|---|---|---|---|---|---|---|---|
| 72 | 2022-07-16 | koru | koru | kisumu | NaN | bus_rolled_as_it_tried_avoiding_collision_with... | passengers | 0 | 14:00 |
In [19]:
df[df['county'] == 'kisumu']
Out[19]:
| date | accident_spot | area | county | road/highway | brief_accident_details/cause | victims | total_people_confirmed_dead | time_of_the_accidents | |
|---|---|---|---|---|---|---|---|---|---|
| 22 | 2022-06-15 | koru | muhoroni | kisumu | kisumu_highway_ | bus_lost_control_and_rolled | passengers_and_driver | <NA> | NaN |
| 24 | 2023-08-29 | ojolla | ojolla | kisumu | kisumu_-_busia_road | head_on_collision_ | passengers_and_drivers | <NA> | NaN |
| 39 | 2023-08-28 | ojola | ojola | kisumu | kisumu_-_busia_road | driver_tried_to_overtake,_rammed_into_a_traile... | passenger | 1 | NaN |
| 68 | 2023-01-29 | mamboleo | mamboleo | kisumu | kakamega_kisumu_road | bus_veered_off_the_road_and_landed_in_a_ditch | passengers | 0 | NaN |
| 71 | 2022-08-30 | coptic_round | coptic | kisumu | kisumu_kakamega_road | truck_lost_control_and_overturned | pedestrians | 3 | NaN |
| 72 | 2022-07-16 | koru | koru | kisumu | NaN | bus_rolled_as_it_tried_avoiding_collision_with... | passengers | 0 | 14:00 |
There is no much supporting data for the null 'road/highway' column hence safe to drop it
In [20]:
# Remove rows where 'road/highway' is null
df = df.dropna(subset=['road/highway'])
df = df.reset_index(drop=True)
In [21]:
df.isna().sum()
Out[21]:
date 0 accident_spot 0 area 0 county 0 road/highway 0 brief_accident_details/cause 2 victims 17 total_people_confirmed_dead 9 time_of_the_accidents 55 dtype: int64
In [22]:
# brief_accident_details/cause
df[df['brief_accident_details/cause'].isna()]
Out[22]:
| date | accident_spot | area | county | road/highway | brief_accident_details/cause | victims | total_people_confirmed_dead | time_of_the_accidents | |
|---|---|---|---|---|---|---|---|---|---|
| 43 | 2023-04-02 | masaba | masaba | kisii | kisii_keroka_road | NaN | NaN | <NA> | NaN |
| 58 | 2023-04-21 | mau_summit | kuresoi_north | nakuru | nakuru_eldoret_highway | NaN | pedestrian | 1 | 18:30 |
In [23]:
df[df['road/highway'] == 'kisii_keroka_road']
Out[23]:
| date | accident_spot | area | county | road/highway | brief_accident_details/cause | victims | total_people_confirmed_dead | time_of_the_accidents | |
|---|---|---|---|---|---|---|---|---|---|
| 43 | 2023-04-02 | masaba | masaba | kisii | kisii_keroka_road | NaN | NaN | <NA> | NaN |
| 60 | 2023-04-01 | amabuko | kroka_town | kisii | kisii_keroka_road | matatu_collided_with_a_trailer | passengers | 5 | NaN |
In [24]:
# Filter rows where road/highway is "kisii_keroka_road"
kisii_keroka_road_rows = df[df['road/highway'] == 'kisii_keroka_road']
# Use loc to apply backward fill on the slice
df.loc[kisii_keroka_road_rows.index, 'brief_accident_details/cause'] = kisii_keroka_road_rows['brief_accident_details/cause'].bfill()
In [25]:
df[df['road/highway'] == 'nakuru_eldoret_highway']
Out[25]:
| date | accident_spot | area | county | road/highway | brief_accident_details/cause | victims | total_people_confirmed_dead | time_of_the_accidents | |
|---|---|---|---|---|---|---|---|---|---|
| 26 | 2023-07-09 | ngata_bridge | ngata | nakuru | nakuru_eldoret_highway | matatu_and_truck_head_on_collision | passenger | 1 | 22:00 |
| 30 | 2023-05-15 | sachangwan | sachangwan | bomet | nakuru_eldoret_highway | matatu_hit_a_bump,_veered_off_the_road_and_lan... | NaN | 5 | 02:00 |
| 35 | 2023-06-24 | migaa | mau_hills | nakuru | nakuru_eldoret_highway | matatu_driver_lot_ontrol_and_hit_a_lorry | driver_and_passengers | 3 | NaN |
| 37 | 2023-09-26 | ainabkoi_intersection | ainabkoi | uansin_gishi | nakuru_eldoret_highway | bus_collided_with_a_saloon_car | NaN | 0 | NaN |
| 48 | 2023-08-08 | sobea | sobea | nakuru | nakuru_eldoret_highway | matatu_collided_with_a_truck | passengers | 2 | NaN |
| 56 | 2023-06-24 | migaa | mau_hills | nakuru | nakuru_eldoret_highway | matatu_driver_lost_control_and_hit_a_lorry | passenger | 3 | NaN |
| 57 | 2023-05-15 | sachangwan | sachangwan | nakuru | nakuru_eldoret_highway | matatu_hit_a_bump_and_veered_off_the_road | passengers | 5 | 02:00 |
| 58 | 2023-04-21 | mau_summit | kuresoi_north | nakuru | nakuru_eldoret_highway | NaN | pedestrian | 1 | 18:30 |
| 78 | 2023-08-08 | sobea | sobea | nakuru | nakuru_eldoret_highway | matatu_collided_with_a_truck | passengers | 2 | 04:30 |
In [26]:
# Filter rows where road/highway is "knakuru_eldoret_highway"
nakuru_eldoret_highway_rows = df[df['road/highway'] == 'nakuru_eldoret_highway']
# Use loc to apply forward fill on the slice as tha cause above would fit the victim
df.loc[nakuru_eldoret_highway_rows.index, 'brief_accident_details/cause'] = nakuru_eldoret_highway_rows['brief_accident_details/cause'].ffill()
In [27]:
df.isna().sum()
Out[27]:
date 0 accident_spot 0 area 0 county 0 road/highway 0 brief_accident_details/cause 0 victims 17 total_people_confirmed_dead 9 time_of_the_accidents 55 dtype: int64
In [28]:
# total_people_confirmed_dead
df[df['total_people_confirmed_dead'].isna()]
Out[28]:
| date | accident_spot | area | county | road/highway | brief_accident_details/cause | victims | total_people_confirmed_dead | time_of_the_accidents | |
|---|---|---|---|---|---|---|---|---|---|
| 12 | 2021-02-23 | nithi | nithi_bridge | tharaka_nithi | meru_embu_road | head_on_collision | NaN | <NA> | 02:00 |
| 14 | 2020-08-17 | river_maara | river_maara | tharaka_nithi | meru_embu_road | vehicle_rolled_into_the_river | NaN | <NA> | NaN |
| 21 | 2023-09-27 | sotik | sotik | bomet | bomet_-_sotik_highway | bodaboda_rider_hit_by_matatu | rider | <NA> | NaN |
| 22 | 2022-06-15 | koru | muhoroni | kisumu | kisumu_highway_ | bus_lost_control_and_rolled | passengers_and_driver | <NA> | NaN |
| 23 | 2021-08-29 | londiani | londiani | nakuru | nakuru_kericho_highway | vehicle_lost_control_nd_rolled | NaN | <NA> | NaN |
| 24 | 2023-08-29 | ojolla | ojolla | kisumu | kisumu_-_busia_road | head_on_collision_ | passengers_and_drivers | <NA> | NaN |
| 25 | 2021-08-25 | greensteads | greensteads | nakuru | nakuru_-_nairobi_highway_ | head_on_collision | passengers | <NA> | NaN |
| 27 | 2023-01-13 | ratili | ratili_narok_south | narok | narok_road | saloom_car_rolled | passangers_and_driver | <NA> | NaN |
| 43 | 2023-04-02 | masaba | masaba | kisii | kisii_keroka_road | matatu_collided_with_a_trailer | NaN | <NA> | NaN |
If we have no information on the victims, then we cant assign a number to the total people confirmed dead hence we drop rows where both victims and total confirmed dead are null
In [29]:
df[df['victims'].isna() & df['total_people_confirmed_dead'].isna()]
Out[29]:
| date | accident_spot | area | county | road/highway | brief_accident_details/cause | victims | total_people_confirmed_dead | time_of_the_accidents | |
|---|---|---|---|---|---|---|---|---|---|
| 12 | 2021-02-23 | nithi | nithi_bridge | tharaka_nithi | meru_embu_road | head_on_collision | NaN | <NA> | 02:00 |
| 14 | 2020-08-17 | river_maara | river_maara | tharaka_nithi | meru_embu_road | vehicle_rolled_into_the_river | NaN | <NA> | NaN |
| 23 | 2021-08-29 | londiani | londiani | nakuru | nakuru_kericho_highway | vehicle_lost_control_nd_rolled | NaN | <NA> | NaN |
| 43 | 2023-04-02 | masaba | masaba | kisii | kisii_keroka_road | matatu_collided_with_a_trailer | NaN | <NA> | NaN |
In [30]:
# Drop rows where both 'victims' and 'total_people_confirmed_dead' are null
df = df.dropna(subset=['victims', 'total_people_confirmed_dead'], how='all')
In [31]:
df[df['total_people_confirmed_dead'].isna()]
Out[31]:
| date | accident_spot | area | county | road/highway | brief_accident_details/cause | victims | total_people_confirmed_dead | time_of_the_accidents | |
|---|---|---|---|---|---|---|---|---|---|
| 21 | 2023-09-27 | sotik | sotik | bomet | bomet_-_sotik_highway | bodaboda_rider_hit_by_matatu | rider | <NA> | NaN |
| 22 | 2022-06-15 | koru | muhoroni | kisumu | kisumu_highway_ | bus_lost_control_and_rolled | passengers_and_driver | <NA> | NaN |
| 24 | 2023-08-29 | ojolla | ojolla | kisumu | kisumu_-_busia_road | head_on_collision_ | passengers_and_drivers | <NA> | NaN |
| 25 | 2021-08-25 | greensteads | greensteads | nakuru | nakuru_-_nairobi_highway_ | head_on_collision | passengers | <NA> | NaN |
| 27 | 2023-01-13 | ratili | ratili_narok_south | narok | narok_road | saloom_car_rolled | passangers_and_driver | <NA> | NaN |
In [32]:
df[df['victims'] == 'rider']
Out[32]:
| date | accident_spot | area | county | road/highway | brief_accident_details/cause | victims | total_people_confirmed_dead | time_of_the_accidents | |
|---|---|---|---|---|---|---|---|---|---|
| 21 | 2023-09-27 | sotik | sotik | bomet | bomet_-_sotik_highway | bodaboda_rider_hit_by_matatu | rider | <NA> | NaN |
In [33]:
# Fill 'total_people_confirmed_dead' with 1 where 'victims' is 'rider'
df.loc[df['victims'] == 'rider', 'total_people_confirmed_dead'] = 1
In [34]:
# central tendancies of total people confirmed dead
max = df['total_people_confirmed_dead'].max()
min = df['total_people_confirmed_dead'].min()
mean = np.mean(df['total_people_confirmed_dead'])
mode = df['total_people_confirmed_dead'].mode()[0]
median = df['total_people_confirmed_dead'].median()
print(f"Max: {max}")
print(f"Min: {min}")
print(f"Mean: {mean}")
print(f"Mode: {mode}")
print(f"Median: {median}")
Max: 52 Min: 0 Mean: 4.957746478873239 Mode: 1 Median: 3.0
In [35]:
# trying to see the distribution of dead people values
px.box(df['total_people_confirmed_dead'])
The distribution has outliers hence it would not be appropriate to replace with mean but rather the median as the nulls have both passengers and driver as the victims meaning they are more than one
In [36]:
# Replace nulls in 'total_people_confirmed_dead' with the column's median
df['total_people_confirmed_dead'] = df['total_people_confirmed_dead'].fillna(df['total_people_confirmed_dead'].median())
In [37]:
df.isna().sum()
Out[37]:
date 0 accident_spot 0 area 0 county 0 road/highway 0 brief_accident_details/cause 0 victims 13 total_people_confirmed_dead 0 time_of_the_accidents 52 dtype: int64
In [38]:
# victims
df['victims'].value_counts()
Out[38]:
victims passengers 36 driver_and_passengers 4 passengers_and_driver 3 passenger 3 pedestrians 3 pedestrian 2 occupants_of_the_small_vehicle_ 2 passengers_and_drivers 2 drivers/occupants 1 passengers_and_pedestrians_ 1 drivers,passengers_and_pedestrians 1 rider 1 passangers_and_driver 1 driver_and_other_motorists_ 1 motorist 1 Name: count, dtype: int64
In [39]:
# Dictionary to map similar names to a unified value
mapping_dict = {
'passengers': 'passengers',
'driver_and_passengers': 'driver_and_passengers',
'passengers_and_driver': 'driver_and_passengers',
'passenger': 'passengers',
'pedestrians': 'pedestrians',
'pedestrian': 'pedestrians',
'occupants_of_the_small_vehicle_': 'occupants_of_the_small_vehicle_',
'passengers_and_drivers': 'driver_and_passengers',
'drivers/occupants': 'drivers/occupants',
'passengers_and_pedestrians_': 'passengers_and_pedestrians',
'drivers,passengers_and_pedestrians': 'driver_passenger_pedestrian',
'rider': 'motorist',
'passangers_and_driver': 'driver_and_passengers',
'driver_and_other_motorists_': 'driver_and_motorists',
'motorist': 'motorist'
}
# Replace values in the 'victims' column based on the mapping
df['victims'] = df['victims'].replace(mapping_dict)
# Check updated value counts
df['victims'].value_counts()
Out[39]:
victims passengers 39 driver_and_passengers 10 pedestrians 5 occupants_of_the_small_vehicle_ 2 motorist 2 drivers/occupants 1 passengers_and_pedestrians 1 driver_passenger_pedestrian 1 driver_and_motorists 1 Name: count, dtype: int64
In [40]:
df[df['victims'].isna()]
Out[40]:
| date | accident_spot | area | county | road/highway | brief_accident_details/cause | victims | total_people_confirmed_dead | time_of_the_accidents | |
|---|---|---|---|---|---|---|---|---|---|
| 28 | 2023-05-22 | isinya | isinya | kajiado | isinya_kiserian_road | bus_lost_control_and_rolled_into_a_ditch | NaN | 7 | NaN |
| 30 | 2023-05-15 | sachangwan | sachangwan | bomet | nakuru_eldoret_highway | matatu_hit_a_bump,_veered_off_the_road_and_lan... | NaN | 5 | 02:00 |
| 36 | 2023-09-27 | laisamis | laisamis | isiolo | marsabit_isiolo_highway | vehicle_veered_off_rhe_road_after_tyre_burst | NaN | 2 | NaN |
| 37 | 2023-09-26 | ainabkoi_intersection | ainabkoi | uansin_gishi | nakuru_eldoret_highway | bus_collided_with_a_saloon_car | NaN | 0 | NaN |
| 42 | 2023-04-16 | josa | mwatate | taita_taveta | mombasa_-_nairobi_highway_ | head_on_collision | NaN | 10 | NaN |
| 44 | 2023-09-18 | gitaru | gitaru | kiambu | nakuru_-_nairobi_highway_ | driver_lost_control_and_hit_barrier | NaN | 0 | NaN |
| 45 | 2023-09-08 | malili | malili | makueni | mombasa_-_nairobi_highway_ | involved_a_truck,_lorry_and_saloon_car | NaN | 0 | 16:20 |
| 47 | 2023-08-30 | nairagie-enkare | narok | narok | narok_mai_mahiu_road | bus_burst_into_flames | NaN | 0 | NaN |
| 51 | 2023-07-04 | mau_summit | kuresoi_north | nakuru | nakuru_kericho_highway | bus_lost_control_and_landed_in_a_ditch | NaN | 0 | NaN |
| 52 | 2022-04-26 | kirinyaga | kirinyaga | kirinyaga | rukenya_kimunye_road | bus_overturned | NaN | 0 | NaN |
| 55 | 2021-12-03 | tuthamba | kirinyaga | kirinyaga | sagana_kagio_road | matatu_collided_head_on_with_a_saloon_car | NaN | 0 | 21:00 |
| 59 | 2023-04-18 | naivasha | naivasha | nakuru | nakuru_-_nairobi_highway_ | matatu_collided_with_a_lorry | NaN | 5 | NaN |
| 65 | 2023-08-08 | ndarugo | ndarugo | kiambu | thika_road_ | truck_lost_control_and_overturned | NaN | 0 | 10:00 |
In [41]:
# Get the top 2 value counts
top_2_values = df['victims'].value_counts().head(2).index
# Randomly choose from the top 2 values and fill the NaNs
df.loc[df['victims'].isna(), 'victims'] = np.random.choice(top_2_values, size=df['victims'].isna().sum())
In [42]:
df.isna().sum()
Out[42]:
date 0 accident_spot 0 area 0 county 0 road/highway 0 brief_accident_details/cause 0 victims 0 total_people_confirmed_dead 0 time_of_the_accidents 52 dtype: int64
In [43]:
# time
df['time_of_the_accidents'].unique()
Out[43]:
array(['16:30', '17:50', nan, '18:00', '12:30', '02:00', '16:20', '18:30',
'21:00', '22:00', '23:30', '13:00', '19:30', '07:30', '10:00',
'23:00', '04:30'], dtype=object)
In [44]:
mapping_dict = {
'16:30': 'afternoon',
'17:50': 'evening',
'18:00': 'evening',
'12:30': 'afternoon',
'02:00': 'night',
'16:20': 'afternoon',
'18:30': 'evening',
'21:00': 'night',
'22:00': 'night',
'23:30': 'night',
'13:00': 'afternoon',
'19:30': 'night',
'07:30': 'morning',
'10:00': 'morning',
'23:00': 'night',
'04:30': 'night'
}
# Replace values in the 'time_of_the_accidents' column based on the mapping
df['time_of_the_accidents'] = df['time_of_the_accidents'].replace(mapping_dict)
# Check updated value counts
df['time_of_the_accidents'].value_counts()
Out[44]:
time_of_the_accidents night 12 afternoon 5 evening 4 morning 2 Name: count, dtype: int64
In [45]:
df[df['time_of_the_accidents'].isna()].head(10)
Out[45]:
| date | accident_spot | area | county | road/highway | brief_accident_details/cause | victims | total_people_confirmed_dead | time_of_the_accidents | |
|---|---|---|---|---|---|---|---|---|---|
| 2 | 2023-07-25 | ntulele | ntulele | narok | narok_mai_mahiu_road | head_on_collision | drivers/occupants | 4 | NaN |
| 4 | 2022-12-01 | mutira | mutira | kirinyaga | kerugoya-karatina_road | run_over | pedestrians | 1 | NaN |
| 5 | 2022-08-06 | mlima_swara | mlima_swara | murang'a | kenol-sagana_road | car_hit_a_stationary_lorry | passengers | 5 | NaN |
| 6 | 2022-07-27 | nithi | nithi_bridge | tharaka_nithi | meru_embu_road | driver_lost_control_and_swerved_off_the_bridge | passengers | 6 | NaN |
| 7 | 2022-07-28 | makuyu | makuyu | murang'a | kenol-sagana_road | vehicle_rammed_into_a_lorry | passengers | 7 | NaN |
| 8 | 2022-06-28 | nithi | nithi_bridge | tharaka_nithi | meru_embu_road | vehicle_rolled_into_the_bridge | passengers | 4 | NaN |
| 9 | 2021-09-21 | river_tungu | river_tungu | tharaka_nithi | meru_embu_road | vehicle_rolled_into_the_river | passengers | 5 | NaN |
| 10 | 2021-09-27 | nithi | nithi_bridge | tharaka_nithi | meru_embu_road | vehicle_rolled_into_the_bridge | passengers | 4 | NaN |
| 15 | 2023-09-08 | mlima_kiu | salama | makueni | mombasa_-_nairobi_highway_ | matatu_collided_head_on_with_an_oncoming_truck | passengers | 4 | NaN |
| 16 | 2023-04-18 | naivasha | delamere_farm | naivasha | nairobi_-_nakuru_highway | head_on_collision_matatu_and_lorry | passengers | 6 | NaN |
In [46]:
df[df['road/highway'] == 'narok_mai_mahiu_road']
Out[46]:
| date | accident_spot | area | county | road/highway | brief_accident_details/cause | victims | total_people_confirmed_dead | time_of_the_accidents | |
|---|---|---|---|---|---|---|---|---|---|
| 2 | 2023-07-25 | ntulele | ntulele | narok | narok_mai_mahiu_road | head_on_collision | drivers/occupants | 4 | NaN |
| 3 | 2022-12-02 | suswa | suswa | narok | narok_mai_mahiu_road | head_on_collision | driver_and_passengers | 3 | evening |
| 47 | 2023-08-30 | nairagie-enkare | narok | narok | narok_mai_mahiu_road | bus_burst_into_flames | driver_and_passengers | 0 | NaN |
| 49 | 2023-07-25 | ntulele | narok | narok | narok_mai_mahiu_road | truck_collided_head_on_with_another_truck | passengers | 3 | NaN |
| 50 | 2022-12-02 | suswa | suswa | narok | narok_mai_mahiu_road | head_on_collision | driver_and_passengers | 1 | NaN |
In [47]:
# filling time based on the most frequent time for each 'road/highway'
# Group by 'road/highway' and get the most frequent time for each group
most_frequent_time = df.groupby('road/highway')['time_of_the_accidents'].transform(lambda x: x.mode().iloc[0] if not x.mode().empty else np.nan)
# Fill null values in 'time of accident' with the most frequent time per 'road/highway'
df['time_of_the_accidents'] = df['time_of_the_accidents'].fillna(most_frequent_time)
# finding out the nulls after applying this formula
df['time_of_the_accidents'].isna().sum()
Out[47]:
30
In [48]:
# filling time based on the most frequent time for each 'accident_spot'
# Group by 'accident_spot' and get the most frequent time for each group
most_frequent_time = df.groupby('accident_spot')['time_of_the_accidents'].transform(lambda x: x.mode().iloc[0] if not x.mode().empty else np.nan)
# Fill null values in 'time of accident' with the most frequent time per 'accident_spot'
df['time_of_the_accidents'] = df['time_of_the_accidents'].fillna(most_frequent_time)
# finding out the nulls after applying this formula
df['time_of_the_accidents'].isna().sum()
Out[48]:
25
In [49]:
# filling time based on the most frequent time for each 'area'
# Group by 'area' and get the most frequent time for each group
most_frequent_time = df.groupby('area')['time_of_the_accidents'].transform(lambda x: x.mode().iloc[0] if not x.mode().empty else np.nan)
# Fill null values in 'time of accident' with the most frequent time per 'area'
df['time_of_the_accidents'] = df['time_of_the_accidents'].fillna(most_frequent_time)
# finding out the nulls after applying this formula
df['time_of_the_accidents'].isna().sum()
Out[49]:
24
In [50]:
# filling time based on the most frequent time for each 'county'
# Group by ''county' and get the most frequent time for each group
most_frequent_time = df.groupby('county')['time_of_the_accidents'].transform(lambda x: x.mode().iloc[0] if not x.mode().empty else np.nan)
# Fill null values in 'time of accident' with the most frequent time per 'county'
df['time_of_the_accidents'] = df['time_of_the_accidents'].fillna(most_frequent_time)
# finding out the nulls after applying this formula
df['time_of_the_accidents'].isna().sum()
Out[50]:
21
In [51]:
# Forward fill for the remaining null values
df.loc[:, 'time_of_the_accidents'] = df['time_of_the_accidents'].ffill()
# finding out the nulls after applying this formula
df['time_of_the_accidents'].isna().sum()
Out[51]:
0
In [52]:
df = df.reset_index(drop=True)
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 75 entries, 0 to 74 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 date 75 non-null datetime64[ns] 1 accident_spot 75 non-null object 2 area 75 non-null object 3 county 75 non-null object 4 road/highway 75 non-null object 5 brief_accident_details/cause 75 non-null object 6 victims 75 non-null object 7 total_people_confirmed_dead 75 non-null Int64 8 time_of_the_accidents 75 non-null object dtypes: Int64(1), datetime64[ns](1), object(7) memory usage: 5.5+ KB
EDA (Exploratory Data Analysis)¶
In [53]:
# trying to see the trend of 'total_people_confirmed_dead' across the years
# Create the scatter plot
fig = px.scatter(
df,
x='date', # X-axis: time
y='total_people_confirmed_dead', # Y-axis: total confirmed dead
title='Total People Confirmed Dead Over Time',
labels={'date': 'Date', 'total_people_confirmed_dead': 'Total Confirmed Dead'},
template='plotly_dark' # Optional: Dark theme
)
# Show the plot
fig.show()
In [54]:
# trend of victims
# Group by 'victims' category and sum the total deaths for each category
victims_death_sum = df.groupby('victims')['total_people_confirmed_dead'].sum().reset_index()
# Sort the DataFrame by total_deaths in descending order
victims_death_sum = victims_death_sum.sort_values(by='total_people_confirmed_dead', ascending=False)
# Create the bar graph
fig = px.bar(
victims_death_sum,
x='victims', # X-axis: categories of victims
y='total_people_confirmed_dead', # Y-axis: sum of total deaths
title='Total Deaths by Victim Category',
labels={'victims': 'Victim Category', 'total_people_confirmed_dead': 'Total Deaths'},
text='total_people_confirmed_dead', # Display total deaths on top of bars
template='plotly_dark', # Optional: dark theme
)
# Adjust the layout for better readability
fig.update_layout(
xaxis_tickangle=-45, # Rotate x-axis labels for better readability
yaxis_title='Total Deaths',
xaxis_title='Victim Category',
)
# Show the plot
fig.show()
In [55]:
# distribution of the accident spots
fig = px.histogram(
df,
x='accident_spot', # X-axis: spot (location type or any other categorical data)
title='Distribution of Accident Spots',
labels={'spot': 'Spot'},
template='plotly_dark' # Optional: dark theme
)
# Adjust the layout for better readability
fig.update_layout(
xaxis_title='Spot',
yaxis_title='Count',
xaxis_tickangle=-45 # Rotate x-axis labels if necessary for better readability
)
# Show the plot
fig.show()